### Load required libraries
library(readxl)
library(tidyverse)
library(dplyr)
library(knitr)
library(DT)
library(httr)
library(jsonlite)
### Load functions
source(file = "../R/02_functions.R")
### Define paths
current_dir <- getwd()
# MAF-like file
maf_data <- "../data/_raw/41467_2017_1460_MOESM6_ESM_somatic_mutations.xlsx"
maf_path <- file.path(current_dir,
maf_data)
### Read data
maf_df <- read_excel(maf_path,
skip=1,
col_names=TRUE)
See mutations per
patient
unique_tumor_counts <- maf_df %>%
dplyr::count(tumor_name,
sort = TRUE) # Count unique values and sort
# Display the result
datatable(unique_tumor_counts,
extensions = 'Buttons',
options = list(
dom = 'Bfrtip',
buttons = c('copy', 'excel', 'csv'),
scrollX=TRUE,
pageLength=10,
columnDefs = list(list(
targets = "_all",
render = JS(
"function(data, type, row, meta) {",
" return data === null ? 'NA' : data;",
"}"
)
))
),
caption = "Mutation counts per patient"
)
Pre-processing
# Filter rows where Entrez_Gene_Id is 0
missing_entrez_df <- maf_df %>%
filter(Entrez_Gene_Id == 0)
# Filter rows where Hugo_Symbol is Unknown
missing_hugo_df <- maf_df %>%
filter(Hugo_Symbol == 'Unknown')
n_rows_1 <- nrow(missing_entrez_df)
n_rows_2 <- nrow(missing_hugo_df)
print(paste("Number of genomic regions missing an Entrez ID:", n_rows_1))
## [1] "Number of genomic regions missing an Entrez ID: 172"
print(paste("Number of genomic regions missing Hugo Symbol:", n_rows_2))
## [1] "Number of genomic regions missing Hugo Symbol: 957"
Get the Hugo Symbols
& Ensembl Gene IDs from Entrez IDs
# Connect to the Ensembl BioMart database
ensembl = biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL",
host="https://grch37.ensembl.org",
path="/biomart/martservice",
dataset="hsapiens_gene_ensembl")
# Extracting non-NA and non-zero values and ensuring they are unique
entrez_ids <- unique(na.omit(maf_df$Entrez_Gene_Id[maf_df$Entrez_Gene_Id != 0]))
# Ensure that entrez_ids is a character vector
entrez_ids <- as.character(entrez_ids)
# Retrieve Hugo symbols & Ensembl gene ids
reannotated_df <- biomaRt::getBM(attributes = c("entrezgene_id",
"hgnc_symbol",
"ensembl_gene_id"),
filters = "entrezgene_id",
values = entrez_ids,
mart = ensembl)
# Rename columns so that they match maf_df
# reannotated_df <- reannotated_df %>%
# dplyr::rename(
# Entrez_Gene_Id = entrezgene_id,
# Ensembl_Gene_Id = ensembl_gene_id
# )
# Left join reannotated_df with maf_df
maf_df_updated <- maf_df %>%
left_join(reannotated_df, by = c("Entrez_Gene_Id" = "entrezgene_id"))
## Warning in left_join(., reannotated_df, by = c(Entrez_Gene_Id = "entrezgene_id")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 6 of `x` matches multiple rows in `y`.
## ℹ Row 3125 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# View the updated dataframe
DT::datatable(maf_df_updated,
extensions = c('Buttons'),
options = list(
dom = 'Bfrtip',
buttons = c('copy', 'excel', 'csv'),
scrollX=TRUE,
pageLength=10,
columnDefs = list(list(
targets = "_all",
render = JS(
"function(data, type, row, meta) {",
" return data === null ? 'NA' : data;",
"}"
)
))
),
caption = "MAF_df reannotated"
)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Count the number of unique values in the ensembl_gene_id column
num_unique_values <- reannotated_df %>%
summarise(unique_count = n_distinct(entrezgene_id)) %>%
pull(unique_count)
# Print the number of unique values
print(paste("Number of unique Entrez Gene IDs in reannotated_df:", num_unique_values))
## [1] "Number of unique Entrez Gene IDs in reannotated_df: 6629"
# Search for a particular Entrez_Gene_Id
reannotated_df %>%
filter(entrezgene_id == "100526835")
## entrezgene_id hgnc_symbol ensembl_gene_id
## 1 100526835 FPGT-TNNI3K ENSG00000259030
## 2 100526835 TNNI3K ENSG00000116783
Export info to use
Ensembl VEP (web version)
### Extract info in correct format for VEP input
api_input <- maf_df %>%
select(Chromosome,
Start_position,
End_position,
ref_allele,
alt_allele,
Strand) %>%
mutate(Allele_ref_alt = paste(ref_allele,
alt_allele,
sep = "/")) %>%
mutate(API_info = paste(Chromosome,
Start_position,
End_position,
Allele_ref_alt,
Strand,
sep = " "))
### Export the info to a file and use the Web VEP
vep_input_data <- "../data/vep_input_grch37.txt"
vep_file_path <- file.path(current_dir,
vep_input_data)
write.table(api_input$API_info, file = vep_file_path, row.names = FALSE, col.names = FALSE, quote = FALSE)
Run VEP and explore
output
### Define path
vep_out_data <- "../data/vep_output_grch37_01.txt"
vep_path <- file.path(current_dir,
vep_out_data)
### Read data
vep_df <- read.csv(vep_path, header = TRUE, sep = "\t")
### Display result
datatable(vep_df,
extensions = c('Buttons'),
options = list(
dom = 'Bfrtip',
buttons = c('copy', 'excel', 'csv'),
scrollX=TRUE,
pageLength=10,
columnDefs = list(list(
targets = "_all",
render = JS(
"function(data, type, row, meta) {",
" return data === null ? 'NA' : data;",
"}"
)
))
),
caption = "VEP output"
)